library(tidyverse)
library(rmarkdown)    # You need this library to run this template.
library(epuRateFlo)     
library(xlsx)
library(data.table)
library(ggplot2)
library(plotly)
library(lubridate) #date
library(DT)
library(ggmap)
library(broom)
library(rgdal)
library(httr)
library(nycmaps)
library(maps)
library(knitr)
library(kableExtra)
options(scipen = 999)

Data includes pickups information (latitude, longitude, time, etc.) from Uber and 10 other for-hire vehicle (FHV) companies in the New York City area. This analysis will focus on the July-September 2014 period. Note that more data is available and would require a more thorough analysis.

How to use this html file: a “code” button next to the title of this file (top right) gives you the option to show or hide all the code including in this file; alternatively, you can show code for each section separately with the specific “code” button.
Table of contents is available on the left.

# importing the first sheet of the data
aggregate = read.xlsx("uber-tlc-foil-response-master/Aggregate FHV Data.xlsx",1,check.names=FALSE)

# adding day of the week
#aggregate$Day = weekdays(as.Date(aggregate$Date))
#aggregate$Day=factor(aggregate$Day, levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday", "Saturday", "Sunday"))

# We combine 9 companies by suming up the number of rides from each company
#aggregate$other9 = apply(aggregate[,2:10],1,sum)

1 Overall descriptive analysis

In this section, we are looking into the aggregate data from different FHV:
American, Carmel, Dial 7, Diplo, Firstclass, Highclass, Prestige, Skyline, Lyft, Uber, Yellow Taxis, Green Taxis.

We first combine the data from 9 FHV companies (American, Carmel, Dial 7, Diplo, Firstclass, Highclass, Prestige, Skyline, Lyft) as these companies have low number of rides per day.

# We combine 9 companies by suming up the number of rides from each company
aggregate$other9 = apply(aggregate[,2:10],1,sum)
## plot uber data over time
# shows 2 spikes and weekdays/weekends
ggplot(data = aggregate, aes(x = Date, y = Uber))+
  geom_line(color=color.mixo(1))

1.1 Number of rides per day of the week

# adding day of the week
aggregate$Day = weekdays(as.Date(aggregate$Date))
aggregate$Day=factor(aggregate$Day, levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday", "Saturday", "Sunday"))

We plot the number of rides over the July-September 2014 period per day of the week. The following graphs are interactive and hoovering gives you the number of rides for each data point.

# Uber: Sundays+Mondays vs Fridays+Saturdays, over September
ind_month=which(month(aggregate$Date) == 9 & day(aggregate$Date) > 5)
FS = ind_month[which(aggregate$Day[ind_month] %in% c("Friday", "Saturday"))]
SM = ind_month[which(aggregate$Day[ind_month] %in% c("Sunday", "Monday"))]
FS_mean=mean(aggregate$Uber[FS])
SM_mean=mean(aggregate$Uber[SM])
#signif((1-SM_mean/FS_mean)*100,2)


# Gren taxis: Saturdays vs Mondays, over September
ind_month=which(month(aggregate$Date) == 9 & day(aggregate$Date) > 5)
S = ind_month[which(aggregate$Day[ind_month] %in% c("Saturday"))]
M = ind_month[which(aggregate$Day[ind_month] %in% c("Monday"))]
S_mean=mean(aggregate$`Green Taxis`[S])
M_mean=mean(aggregate$`Green Taxis`[M])
#signif((1-M_mean/S_mean)*100,2)

The main observations are the following:

  • For both Uber and Yellow Taxis, we observe a sharp increase in the number of rides early July and early September, which might be due to external circumstances such as school holidays.
  • Uber is having less success on Sundays and Mondays than on Fridays and Saturdays and is showing a 27% decrease in the number of rides (around 29000 rides compared to around 40000, over September)
  • Green taxis are having by far the most success on Saturdays compared to Mondays with more than a 39% increase in the number of rides (more than 61000 rides compared to around 37000 rides, over September).

1.1.1 Uber

myplot = ggplot(data = aggregate, aes(x = Date, y = Uber, group=Day, colour=Day))+
  geom_line() + ylab("Trips per day") + ggtitle("Uber")
ggplotly(myplot)

1.1.2 Yellow Taxis

myplot = ggplot(data = aggregate, aes(x = Date, y = `Yellow Taxis`, group=Day, colour=Day))+
  geom_line() + ylab("Trips per day") + ggtitle("Yellow Taxis")
ggplotly(myplot)

1.1.3 Green Taxis

myplot = ggplot(data = aggregate, aes(x = Date, y = `Green Taxis`, group=Day, colour=Day))+
  geom_line() + ylab("Trips per day") + ggtitle("Green Taxis")
ggplotly(myplot)

1.1.4 Other 9 companies

myplot = ggplot(data = aggregate, aes(x = Date, y = other9, group=Day, colour=Day))+
  geom_line() + ylab("Trips per day") + ggtitle("Other 9")
ggplotly(myplot)

1.2 Market shares

1.2.1 Overall market shares per FHV company

Assuming the 12 companies we have information on cover all the FHV in NYC area, we can sum up all the rides for each FHV company over the July-September period and infer the market share of each company over that 3 months period. We can also calculate the market shares for each month.
For clarity, we use the data where the 9 smallest companies are combined.

ss = apply(aggregate[,2:14],2,sum)
out_global = data.frame(Share = signif((ss/sum(ss[-length(ss)]))*100,3))

aggregate$month = month(aggregate$Date) 

temp = aggregate[,c(2:14,16)] %>% group_by(month) %>% summarize_all(list(sum))
out = cbind(out_global, apply(temp,1, function(x) {x[-1]/sum(x[2:13]) })*100)

total_rides = apply(temp[,-c(1,14)],1,sum)
total_rides = rbind(c(sum(total_rides), total_rides))
rownames(total_rides) = "Number of rides"
out_table = rbind(signif(out[-c(1:9),],3))
colnames(out_table ) = colnames(total_rides) =  c("Overall","July", "August", "September")  

datatable(out_table,  options = list(order=list(list(1,"desc")),searching=FALSE,
                   lengthChange = FALSE,
                   lengthMenu = FALSE,
                   pageLength = FALSE,
                   paging = FALSE,
                   info = FALSE), caption = htmltools::tags$caption(
    style = 'caption-side: bottom; text-align: center;', htmltools::em('Market share of the different providers')
  )) %>% formatStyle(0, fontWeight = 'bold')

Yellow Taxis is the main FHV company in NY with more than 80% of the rides being conducted by their drivers.
Uber is increasing is monthly market share over time (+1.2% between July and September) while Yellow Taxis are decreasing theirs (-1.9% over the same period).

datatable(total_rides,  options = list(order=list(list(1,"desc")),searching=FALSE,
                   lengthChange = FALSE,
                   lengthMenu = FALSE,
                   pageLength = FALSE,
                   paging = FALSE,
                   info = FALSE)) %>% formatStyle(0, fontWeight = 'bold')
#datatable(signif(out[13,],3),  options = list(pageLength = 13, order=list(list(1,"desc"))), caption = htmltools::tags$caption( style = 'caption-side: bottom; text-align: center;', htmltools::em('Market share of the different providers'))) %>% formatStyle(0, fontWeight = 'bold')

It is interesting to note the total number of rides over the 3 months period is above 47 millions.

1.2.2 Per week

Here we increase the resolution of the data and focus on a per week analysis, to see if the trend observed on the monthly data is also visible on the weekly data.
As start/end of the weeks do not coincide with start/end of the months, we focus on weeks 27 to 38 (2014-07-06 to 2014-09-27).
As previously, we look at Yellow Taxis, Green Taxis, Uber and we combine the remaining 9 companies.

aggregate$week = format(aggregate$Date, "%U") #week(aggregate$Date)


# head(aggregate)

# table(aggregate$week) # we discard week 26 and 39 as they have less data points

df_byweek= aggregate[,c(11:13,14,17)] %>% filter(week>26 & week<39) %>% group_by(week) %>% summarize_all(list(sum))
# rename week as first day of the week (Sunday)
df_byweek$Date = aggregate$Date [seq(6,89,7)]
  
# wide to long data
df_byweek_long=melt(df_byweek, id.vars=c("week","Date"))
#head(df2)


ggplot(df_byweek_long, aes(x = Date, y=value, group=variable,color=variable)) + geom_line(size=2) +
  ggtitle ("Total number of trips per week") 

We now look at the market share of the FHV companies, calculated per week.

#ggplot(df2, aes(fill=variable, y=value, x=Date)) + 
#  geom_bar( stat="identity", position="dodge")#position="fill")
percent = df_byweek[,2:5]
percent$sum = apply(percent,1,sum)
percent = percent/percent$sum *100
percent = cbind("Week starting on"=df_byweek$Date,signif(percent[,1:4],3))
datatable(percent, options = list(pageLength = 12,searching=FALSE,
                   lengthChange = FALSE,
                   lengthMenu = FALSE,
                   pageLength = FALSE,
                   paging = FALSE,
                   info = FALSE), caption = htmltools::tags$caption(
    style = 'caption-side: bottom; text-align: center;', htmltools::em('Market share of the different providers')
  )) %>% formatStyle('Week starting on', fontWeight = 'bold')

These observations confirm the previously observed trend and show a steady increase of Uber’s market share and a steady decrease of the Yellow Taxis’.

1.2.3 Per day of the week

We now investigate whether the day of the week has an impact on the market shares, since we saw in Section 1.1 that each FHV has its own trend regarding which day of the week they have more customers.

df_byday= aggregate[,c(11:13,14:15,17)] %>% filter(week>26 & week<39) 
df_byday = df_byday[,c(1:5)]%>% group_by(Day) %>% summarize_all(list(sum))

# wide to long data
df_byday_long=melt(df_byday, id.vars=c("Day"))
#head(df2)


ggplot(df_byday_long, aes(x = Day, y=value, group=variable,color=variable)) + geom_line(size=2) +
  ggtitle ("Total number of trips per week") 

We now look at the market share of the FHV companies, calculated per day.

#ggplot(df2, aes(fill=variable, y=value, x=Date)) + 
#  geom_bar( stat="identity", position="dodge")#position="fill")
percent = df_byday[,2:5]
percent$sum = apply(percent,1,sum)
percent = percent/percent$sum
percent = cbind(Date=df_byday$Day,signif(percent[,1:4]*100,3))
datatable(percent, options = list(pageLength = 12,searching=FALSE,
                   lengthChange = FALSE,
                   lengthMenu = FALSE,
                   pageLength = FALSE,
                   paging = FALSE,
                   info = FALSE), caption = htmltools::tags$caption(
    style = 'caption-side: bottom; text-align: center;', htmltools::em('Market share of the different providers')
  )) %>% formatStyle('Date', fontWeight = 'bold')

In Section 1.1 we showed that the Green taxis were having more customers on Saturdays and this increase in the number of rides also leads to an increase in its market shares on Saturdays.

1.3 Rush hours

In this section, we are looking at the number of rides per hour, averaged over the 3 months period (July-September 2014). There is no data available for the Yellow and Green Taxis; we display data for Uber, Lyft and a combination of the remaining 8 FHV companies.

trips = read.xlsx("uber-tlc-foil-response-master/Aggregate FHV Data.xlsx",2, startRow=2, endRow=170)
# table(trips$Weekday) # check right number of entry per day
df= trips %>% group_by(Hour) %>% summarize_all(list(mean))
# wide to long data
df2=melt(df, id.vars=c("Weekday","Hour"))
df2$variable = factor(df2$variable, levels=c("Uber","Lyft","other.8.bases"), labels=c("Uber","Lyft","other8"))
myplot=  ggplot(df2, aes(x = Hour, y=value, group=variable,color=variable)) + geom_line() +
  scale_color_discrete(name="")+ ylab("")+
  ggtitle ("Average number of trips per hour")
ggplotly(myplot)

We observe that pick hours for Uber is around 17H (5pm), while morning rush hour seems to be around 8am.
The other 8 companies on aggregate also have a pick hour at 8am, while they do not appear to have an afternoon rush.
In contrast, Lyft seems to have more customers at night, between 8pm and 5am.

2 Pickup location of Uber rides


In this section, we are having a quick look at the latitude and longitude of Uber pickups over the July-September 2014 period.

# combine the 3months data into one dataset
X = NULL
for(add in c("jul","aug","sep")){
  temp = fread(paste0("uber-tlc-foil-response-master/uber-trip-data/uber-raw-data-",add,"14.csv"))
  X = rbind(X, temp)  
}

# we download a map of NYC that contains the different neighborhoods.
# in order to not download it at every run of this Rmarkdown, I saved it as a Rdata.
#r <- GET('http://data.beta.nyc//dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/35dd04fb-81b3-479b-a074-a27a37888ce7/download/d085e2f8d0b54d4590b1e7d1f35594c1pediacitiesnycneighborhoods.geojson')
load("polygon_NYC.Rdata")
nyc_neighborhoods <- readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
nyc_neighborhoods_df <- tidy(nyc_neighborhoods)

## density plot: too much density in manahattan-> hide lots of data
# xx= X[which(X$Base == "B02682"),]
#ggplot() +  geom_polygon(data=nyc_neighborhoods_df, aes(x=long, y=lat, group=group),color="grey",fill=NA) + stat_density2d(aes(x=xx$Lon,y=xx$Lat,fill =  ..level..,alpha=0.001),geom = "polygon",h = .02, n = 300)+scale_alpha( guide = FALSE)+scale_fill_distiller(palette = 'Spectral')


# limit the data to the range of coordinates of the polygons.
X_subset = X %>% filter (Lon>min(nyc_neighborhoods_df$long) & Lon<max(nyc_neighborhoods_df$long)) %>% filter(Lat>min(nyc_neighborhoods_df$lat) & Lat<max(nyc_neighborhoods_df$lat))


ggplot() + 
  geom_polygon(data=nyc_neighborhoods_df, aes(x=long, y=lat, group=group),color="grey",fill=NA) + geom_bin2d(aes(x=X_subset$Lon,y=X_subset$Lat),bins=150,alpha=0.5)+scale_fill_distiller(palette = 'Spectral')+ scale_x_continuous(expand=c(0,0))+ scale_y_continuous(expand=c(0,0))

Thanks to the density plot, we first observe that pickups are concentrated in the Manhattan area and at the JFK airport.
Second, we observe that a lot of pickups are done outside of the NYC area (across the border with New Jersey). These data points should probably be removed to clarify the analysis results and the plots.

3 Perspective

It would be interesting to compare the observations from the number of rides and market shares of the different FHV companies between the July-September 2014 period and the January-June 2015 period.

It would also give much insights to compare the location of the Uber pickups to the one of the other FHV companies, such as the Yellow Taxis. This could potentially highlights a cause of the increase in Uber rides, such as that maybe Uber drivers are more present outside of Manhattan than other FHV companies.

 




A work by Florian Rohart

florian.rohart@gmail.com